Pre-processing

  1. Adapters were trimmed using cutadapt v1.16
  2. Gene expression was quantified using salmon v1.3.0
  3. TPMs were obtained for the genes using tximport 1.20.0
library(dplyr)
library(ggplot2)
library(DESeq2)
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)

R Markdown

Define Color Scheme and plot them


Plasma 1:10
Viridis 1:10
Cividis 1:10
Magma 1:10

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.

##This section of code will generate a summary table of the samples sequenced and their sequencing and alignment metrics

Summary of Data Metrics
Sample patient reads %Q30 Duplication rate % reads with adapter STAR alignment number percent aligned splices annotated salmon mapping
Bulk HTB314 71816976 94.3 57 2.6 63614504 88.58 27340654 85.6967
Bulk MDS268 31441538 93.0 24 2.3 25549835 81.26 5546270 58.2075
Bulk MDS280 28794421 93.0 24 2.3 23317205 80.98 5115546 58.8614
CD123+ HTB314 61568500 94.0 56 2.7 55359277 89.91 22831187 86.4197
CD123+ MDS268 32307881 93.0 27 3.3 26243441 81.23 7290861 64.2150
CD123+ MDS280 23745205 93.0 34 2.6 21035358 88.59 5668653 71.3245
CD123- HTB314 83260626 94.0 58 2.6 74804722 89.84 31876605 87.6505
CD123- MDS268 27917999 93.0 26 2.7 22835189 81.79 5793322 63.9284
CD123- MDS280 24767573 93.0 32 2.6 22467937 90.72 4773823 62.7193

##This section of code will import and format the data for DeSeq2

#to generate a vector of names and file locations
files<-file.path("~/Desktop/Jordan files/Counts/salmon/salmon/", list.files("~/Desktop/Jordan files/Counts/salmon/salmon/"), "quant.sf")
names(files)<-list.files("~/Desktop/Jordan files/Counts/salmon/salmon")

#to call in a gene_map this was derived by pulling it out of the fasta file with a grep command for ENST and ENSG and pasting them together
gene_map=read_csv("~/Desktop/Jordan files/Counts/salmon/gene_map.csv", col_names=c('enstid', 'ensgid'))

# import transcript level counts
txi.salmon.t<-tximport(files, type="salmon", txOut=TRUE)
txi.salmon.g<-tximport(files=files, type="salmon", tx2gene= gene_map, ignoreTxVersion = TRUE, countsFromAbundance = 'lengthScaledTPM' )
### this code works but if I remove the ignoreTxVersion I get an error, this may have to do with how I am generating my tx2gene file

# Extract counts only
counts <- txi.salmon.g$counts %>%
  as.data.frame()
#Extract TPM
tpms <- data.frame(txi.salmon.g$abundance)

##for clients the counts and tpm files should be written out

##This chunk of code sets an expression threshold for TMP where all samples have at least 5 counts

expressed_genes<-tpms %>% filter_all(all_vars(. > 5))

##This chunk of code generates a heatmap using the spearman method as well as a correlation heatmap ## PCA plot

exp.pca <- prcomp(t(log2(expressed_genes)))
exp.pca.summary <- summary(exp.pca)$importance
pc1var = round(exp.pca.summary[3,1] * 100, 1)
pc2var = round(exp.pca.summary[3,2] * 100 - pc1var, 1)
exp.pca.pc <- data.frame(exp.pca$x, sample = colnames(expressed_genes))

PC1 vs PC2

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   SampleName = col_character(),
##   FileName = col_character(),
##   patient = col_character(),
##   cellType = col_character()
## )

## 3. Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment ## Compare 123pos vs 123neg, 123neg vs bulk, and 123pos vs bulk #### sig = padj <0.01 and abs(l2fc) >0.5 ####

coldata<-dplyr::select(metadata, -FileName) 
coldata<-column_to_rownames(coldata, 'SampleName')
## need to confirm that all names are in the same order
all(rownames(coldata) %in% colnames(txi.salmon.g$counts))
## [1] TRUE
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] FALSE
coldata<-coldata[colnames(txi.salmon.g$counts),]
all(rownames(coldata) == colnames(txi.salmon.g$counts))
## [1] TRUE
dds <- DESeqDataSetFromTximport(txi.salmon.g,
                              colData = coldata,
                              design = ~ patient + cellType)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using just counts from tximport
### unfiltered data
dds.unfiltered <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.unfiltered <- results( dds.unfiltered)

##filtered data
dds.filtered<-dds[rowMins(counts(dds))>5,]
dds.filtered<-DESeq(dds.filtered)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.filtered<-results(dds.filtered)


### need to subset to do pairwise comparison unclear if this keeps the patient design aspect


res_123posvs123neg_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_unfiltered<-results( dds.unfiltered, contrast=c("cellType", "123neg", "bulk"))

res_123posvs123neg_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "123neg"))
res_123posvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123pos", "bulk"))
res_123negvsBulk_filtered<-results(dds.filtered, contrast=c("cellType", "123neg", "bulk"))

## look at the numbers of genes meeting threshold the log fold change call is not changing things
sum( res_123posvs123neg_unfiltered$pvalue < 0.01 & abs(res_123posvs123neg_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 1405
sum( res_123posvsBulk_unfiltered$pvalue < 0.01 & abs(res_123posvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2382
sum(res_123negvsBulk_unfiltered$pvalue < 0.01 & abs(res_123negvsBulk_unfiltered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 2336
sum( res_123posvs123neg_filtered$pvalue < 0.01 & abs(res_123posvs123neg_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 194
sum( res_123posvsBulk_filtered$pvalue < 0.01 & abs(res_123posvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 796
sum(res_123negvsBulk_filtered$pvalue < 0.01 & abs(res_123negvsBulk_filtered$log2FoldChange) >= 0.5, na.rm=TRUE )
## [1] 831

Volcano Plot

## Warning: Removed 840 rows containing missing values (geom_point).

## Warning: Removed 19307 rows containing missing values (geom_point).

## Warning: Removed 20003 rows containing missing values (geom_point).

## Warning: Removed 280 rows containing missing values (geom_point).

## Warning: Removed 20003 rows containing missing values (geom_point).

## Warning: Removed 280 rows containing missing values (geom_point).

###MA plots

pca_data<-vst(dds, blind=T)
ntop=500
rv <- rowVars(assay(pca_data))
select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(pca_data)[select,]))
percentVar <- data.frame(percentVar=pca$sdev^2/sum(pca$sdev^2))%>%
  mutate(pc=1:n())%>%dplyr::select(pc, percentVar)
pca_df<-pca$x%>%data.frame()%>%mutate(type=metadata$cellType)

alt_col_values=c("#88CCEE", "#CC6677", "#DDCC77")

PC1 vs PC2

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.